# download_llt_issue.py
# LLT (Language Learning and Technology) Downloader
# Downloads PDFs from LLT issues via ScholarSpace bitstreams
# - Crawls /item/ article pages
# - Extracts iframe or <a> bitstream links
# - Skips Calls for Papers and Sponsors
# - Uses dynamic safe truncation for filenames

import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin

MAX_PATH = 240  # Safe margin for Windows paths


def sanitize(title, folder, extension=".pdf"):
    """Return a file-name-safe version of *title* that fits within MAX_PATH.

    Whitespace runs (including newlines and tabs) are collapsed to a single
    space, characters that are illegal in Windows file names and ASCII
    control characters are removed, and the result is truncated so that
    ``os.path.join(folder, title + extension)`` is at most MAX_PATH
    characters long whenever the folder leaves room for that.

    Args:
        title: Raw title text to turn into a file name stem.
        folder: Directory the file will be written to; its length counts
            against the path budget.
        extension: Suffix the caller will append to the returned stem.

    Returns:
        The cleaned stem without the extension. It is never empty:
        ``"untitled"`` is returned when nothing usable remains.
    """
    # Collapse whitespace first so an embedded newline becomes a word break
    # instead of silently gluing two words together.
    title = re.sub(r'\s+', ' ', title)
    # Remove illegal characters
    title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', title).strip()

    # Ensure total path length stays under Windows limit
    full_path = os.path.join(folder, title + extension)
    if len(full_path) > MAX_PATH:
        allowed_len = MAX_PATH - len(folder) - len(extension) - 1
        # A very long folder makes the budget zero or negative; a negative
        # slice bound would cut from the wrong end, so keep at least one
        # character instead.
        title = title[:max(allowed_len, 1)].strip()

    # An empty stem would produce a file named just ".pdf".
    return title or "untitled"

# Ask for the issue landing page; the same headers dict is sent with every request.
issue_url = input("Enter LLT issue URL: ").strip()
headers = {"User-Agent": "Mozilla/5.0"}

# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as an issue.
resp = requests.get(issue_url, headers=headers, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

# The first <h2> is used as the issue title; fall back to a generic name
# when the page has no <h2> instead of crashing on None.
heading = soup.find("h2")
issue_title = heading.get_text(strip=True) if heading else ""
if not issue_title:
    issue_title = "Unknown Issue"

# Output folder is "LLT_<issue title>" with characters illegal in Windows
# file names removed.
folder = re.sub(r'[\\/:*?"<>|]', '', "LLT_" + issue_title)
os.makedirs(folder, exist_ok=True)

base_url = "https://www.lltjournal.org"

# Link texts that mark non-article entries. Built (and lower-cased) once,
# outside the loop, for case-insensitive substring matching.
skip_terms = ["Call for Papers", "Previous Issues", "Announcements and news from our sponsors"]
skip_terms_lower = [term.lower() for term in skip_terms]

# Step 1: Collect article pages.
# Keyed by URL so an article linked more than once on the issue page is only
# visited (and downloaded) once; dicts keep insertion order, so page order
# is preserved.
titles_by_url = {}
for a in soup.find_all('a', href=True):
    href = a['href']
    title_text = a.get_text(strip=True)

    # Skip unwanted links
    lowered_text = title_text.lower()
    if any(term in lowered_text for term in skip_terms_lower):
        continue

    # Only site-relative item links are treated as article pages
    if not href.startswith("/item/10125-"):
        continue

    full_url = urljoin(base_url, href)

    # Prefer <strong> tag content if available
    strong_tag = a.find('strong')
    clean_title = strong_tag.get_text(strip=True) if strong_tag else title_text

    # Keep the first non-empty title seen for this URL (a text-less link to
    # the same article must not mask a later link that carries the title).
    if not titles_by_url.get(full_url):
        titles_by_url[full_url] = clean_title

# If no link to an article carried any text, use the last path segment of its
# URL so the download still gets a usable, distinct file name.
article_pages = [
    (found_title or url.rstrip('/').rsplit('/', 1)[-1], url)
    for url, found_title in titles_by_url.items()
]

print(f"Found {len(article_pages)} article pages.")

pdf_links = []

# Step 2: Visit article pages and get PDF link
for title, page_url in article_pages:
    print(f"Visiting: {title}")

    # One unreachable or failing article page should not abort the whole run.
    try:
        r = requests.get(page_url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f"⚠️ Could not load page for {title}: {exc}")
        continue
    page_soup = BeautifulSoup(r.text, 'html.parser')

    # First choice: the first <a> whose href mentions "bitstreams"
    pdf_url = None
    for link in page_soup.find_all('a', href=True):
        if "bitstreams" in link['href']:
            pdf_url = link['href']
            break

    # Fallback: any <iframe> (not just the first one) pointing at a bitstream
    if not pdf_url:
        for iframe in page_soup.find_all('iframe', src=True):
            if "bitstreams" in iframe['src']:
                pdf_url = iframe['src']
                break

    if pdf_url:
        # urljoin leaves absolute URLs untouched and resolves every kind of
        # relative reference against the article page, so no prefix check
        # is needed.
        pdf_links.append((title, urljoin(page_url, pdf_url)))
    else:
        print(f"⚠️ No PDF link found for {title}")

# ✅ Step 3: Download PDFs
saved_count = 0     # PDFs actually written, reported in the final summary
used_names = set()  # lower-cased file names already taken during this run

for title, pdf_url in pdf_links:
    safe_title = sanitize(title, folder)
    fname = safe_title + ".pdf"

    # Two articles can sanitize/truncate to the same name; add a numeric
    # suffix rather than silently overwriting the earlier download. Names are
    # compared case-insensitively because Windows file systems usually are.
    suffix = 2
    while fname.lower() in used_names:
        fname = f"{safe_title} ({suffix}).pdf"
        suffix += 1
    used_names.add(fname.lower())
    path = os.path.join(folder, fname)

    print(f"Downloading: {safe_title}")
    # A network error on one file is reported and skipped, not fatal.
    try:
        r = requests.get(pdf_url, headers=headers, timeout=60)
    except requests.RequestException as exc:
        print(f"FAILED: {title} ({pdf_url}): {exc}")
        continue

    # Only save responses that succeeded and are declared to be PDFs.
    if r.status_code == 200 and r.headers.get('content-type', '').lower().startswith('application/pdf'):
        with open(path, "wb") as f:
            f.write(r.content)
        print(f"Saved: {fname}")
        saved_count += 1
    else:
        print(f"FAILED: {title} ({pdf_url})")

# Report how many files were really saved, not how many links were found.
print(f"\nAll done! {saved_count} of {len(pdf_links)} PDFs downloaded into {folder}.")
